*****************************************************************
* Replication directory for                                   ***
* Prime locations                                             ***
* by Gabriel M. Ahlfeldt, Thilo N.H. Albers, Kristian Behrens ***
* Published in American Economic Review: Insights             ***
*****************************************************************
* 01/2025
* Stata
version 17.0

* This do file calls Python geopandas to create US prime location shapefils for the GitHub toolkit
* Execution requires a Python working environment with packages mentioned at the beginning of the Python code
* Should the code not run through in Stata, the part between "python:" and "end" can be executed in Python
* In this case, you must manually set the root directory in line 35. 

* Create folder for shapefils
	 capture mkdir "$shapeoutput/US-CBSAs"
	 capture mkdir "$shapeoutput/US"
	 	 
	cd "$root"	 
* Create shapefiles by CBSA ****************************************************

python: 


import os
import pandas as pd
import geopandas as gpd
from shapely.geometry import Polygon
from shapely.errors import TopologicalError
import numpy as np
import zipfile  # Use zipfile for creating zip files

# Set root directory to Stata's current working directory
root_directory = os.getcwd()

# File path for the MSA-list with CBSAFP identifiers
msa_list_file = os.path.join(root_directory, r'_Data/US METRO DATA/MSA-list.csv')

# Read the list of identifiers (CBSAFP) from the CSV file
msa_df = pd.read_csv(msa_list_file)
identifiers = msa_df['cbsafp'].tolist()  # Extract the list of identifiers

# File paths for prime location data
pl_input_folder = os.path.join(root_directory, r'_Work/TEMP/grid_PL_output_p99_5')
pl_output_folder = os.path.join(root_directory, r'_Work/Output/Shapes/US-CBSAs')

# Define the projection (EPSG:4326 corresponds to WGS 84 in decimal degrees)
crs = "EPSG:4326"

# Function to check if all coordinates are present and valid
def is_valid_coordinates(row):
    return all(pd.notna([row['lon_UL'], row['lat_UL'], row['lon_LR'], row['lat_LR']]))

# Function to calculate geographic buffer in degrees
def geographic_buffer(geometry, buffer_meters=50):
    # Calculate the latitude and longitude buffer sizes
    centroid = geometry.centroid
    lat = centroid.y  # Latitude of the centroid
    lat_buffer = buffer_meters / 111320  # Convert meters to degrees latitude
    lon_buffer = buffer_meters / (111320 * np.cos(np.radians(lat)))  # Degrees longitude
    
    # Apply the buffer using lat/lon approximations
    bounds = geometry.bounds  # (minx, miny, maxx, maxy)
    buffered = Polygon([
        (bounds[0] - lon_buffer, bounds[1] - lat_buffer),  # Bottom-left
        (bounds[0] - lon_buffer, bounds[3] + lat_buffer),  # Top-left
        (bounds[2] + lon_buffer, bounds[3] + lat_buffer),  # Top-right
        (bounds[2] + lon_buffer, bounds[1] - lat_buffer),  # Bottom-right
        (bounds[0] - lon_buffer, bounds[1] - lat_buffer)   # Close the polygon
    ])
    return buffered

# Loop over each identifier and process files
for identifier in identifiers:
    identifier_str = str(identifier)  # Ensure identifier is string
    
    # Construct the input file path for the prime location data
    pl_input_file = os.path.join(pl_input_folder, f'PL_grids_p99_5_{identifier_str}.csv')
    
    if not os.path.exists(pl_input_file):
        print(f"File not found: {pl_input_file}")
        continue  # Skip to the next identifier if not found
    
    # Load the prime location data
    pl_df = pd.read_csv(pl_input_file)

    # Filter rows with invalid or missing coordinates
    pl_df = pl_df[pl_df.apply(is_valid_coordinates, axis=1)]
    
    def create_polygon(row):
        try:
            # Create a Polygon for each grid cell using lon_UL, lat_UL, lon_LR, lat_LR
            return Polygon([
                (row['lon_UL'], row['lat_UL']),  # Upper Left
                (row['lon_LR'], row['lat_UL']),  # Upper Right
                (row['lon_LR'], row['lat_LR']),  # Lower Right
                (row['lon_UL'], row['lat_LR']),  # Lower Left
                (row['lon_UL'], row['lat_UL'])   # Close the polygon
            ])
        except (ValueError, TopologicalError) as e:
            print(f"Error creating polygon for row: {row['PLID']}, skipping. Error: {e}")
            return None

    # Create a Polygon for each row and handle errors gracefully
    pl_df['geometry'] = pl_df.apply(create_polygon, axis=1)

    # Drop rows where geometry creation failed (None values)
    pl_df = pl_df[pl_df['geometry'].notnull()]

    # Convert the DataFrame to a GeoDataFrame
    pl_gdf = gpd.GeoDataFrame(pl_df, geometry='geometry', crs=crs)
    
    # Apply a custom geographic buffer of 50 meters to each geometry
    pl_gdf['geometry'] = pl_gdf['geometry'].apply(geographic_buffer)
    
    # Group the grid cells by PLID and dissolve them into single polygons for each PLID
    pl_gdf = pl_gdf.dissolve(by='PLID')

    # Construct the output shapefile path for prime locations
    pl_shapefile_path = os.path.join(pl_output_folder, f'PL_{identifier_str}.shp')
    
    # Save the dissolved GeoDataFrame as a shapefile
    pl_gdf.to_file(pl_shapefile_path)
    
    print(f"Prime location shapefile saved at: {pl_shapefile_path}")

    # Create a zip file for the shapefile and its associated files
    zip_filename = os.path.join(pl_output_folder, f'shape_{identifier_str}.zip')
    with zipfile.ZipFile(zip_filename, 'w') as zipf:
        for ext in ['.shp', '.shx', '.dbf', '.prj', '.cpg']:
            shapefile_component = pl_shapefile_path.replace('.shp', ext)
            if os.path.exists(shapefile_component):
                zipf.write(shapefile_component, arcname=os.path.basename(shapefile_component))
                print(f"Added {shapefile_component} to {zip_filename}")
    
    print(f"Zip file created: {zip_filename}")


	
end	

* Combine CBSA shapefiles into one *********************************************

python: 

import os
import pandas as pd
import geopandas as gpd
import zipfile  # For creating zip files

# Set root directory to Stata's current working directory
root_directory = os.getcwd()

# File path for the MSA-list containing the cbsafp identifiers
msa_list_file = os.path.join(root_directory, r'_Data/US METRO DATA/MSA-list.csv')

# Read the list of cbsafp identifiers from the CSV file
msa_df = pd.read_csv(msa_list_file)
identifiers = msa_df['cbsafp'].tolist()  # Extract the cbsafp list

# Folder where the individual shapefiles are saved
input_folder = os.path.join(root_directory, r'_Work/Output/Shapes/US-CBSAs')

# Folder for the combined shapefile_path
all_folder = os.path.join(root_directory, r'_Work/Output/Shapes/US')

# File path for the combined shapefile
combined_shapefile = os.path.join(all_folder, 'PL_all_USCBSAs.shp')

# Delete the old combined shapefile, if it exists, to avoid contamination
if os.path.exists(combined_shapefile):
    for ext in ['.shp', '.shx', '.dbf', '.cpg', '.prj']:
        old_file = combined_shapefile.replace('.shp', ext)
        if os.path.exists(old_file):
            os.remove(old_file)
            print(f"Deleted old file: {old_file}")

# List to store GeoDataFrames
gdf_list = []

# Ensure to clear any leftover data in memory
del gdf_list  # Remove any previous instances of gdf_list from memory
gdf_list = []  # Reinitialize to be sure

# Loop through the identifiers and load corresponding shapefiles
for identifier in identifiers:
    identifier_str = str(identifier)  # Ensure identifier is a string

    # Construct the shapefile path based on the identifier
    shapefile_path = os.path.join(input_folder, f'PL_{identifier_str}.shp')

    # Check if the shapefile exists
    if os.path.exists(shapefile_path):
        # Load the shapefile as a GeoDataFrame
        print(f"Loading shapefile: {shapefile_path}")
        gdf = gpd.read_file(shapefile_path)
        
        # Append the GeoDataFrame to the list
        gdf_list.append(gdf)
    else:
        print(f"Shapefile not found: {shapefile_path}")

# Concatenate all GeoDataFrames into one
if gdf_list:
    combined_gdf = gpd.GeoDataFrame(pd.concat(gdf_list, ignore_index=True))

    # Save the combined GeoDataFrame as a new shapefile
    combined_gdf.to_file(combined_shapefile)
    print(f"Combined shapefile saved at: {combined_shapefile}")

    # Create a zip file for the combined shapefile
    zip_filename = os.path.join(all_folder, 'shape_all.zip')
    with zipfile.ZipFile(zip_filename, 'w') as zipf:
        for ext in ['.shp', '.shx', '.dbf', '.cpg', '.prj']:
            shapefile_component = combined_shapefile.replace('.shp', ext)
            if os.path.exists(shapefile_component):
                zipf.write(shapefile_component, arcname=os.path.basename(shapefile_component))
                print(f"Added {shapefile_component} to {zip_filename}")
    
    print(f"Zip file created: {zip_filename}")
else:
    print("No shapefiles found to combine.")


end
* SCript ends
